# Evaluation of DDSP audio reconstruction on the Shtooka 4876 dataset
# Initialize Notebook
from IPython.core.display import HTML,Image
#%run ../library/v1.0.5/init.ipy
HTML('''<script> code_show=true; function code_toggle() { if (code_show){ $('div.input').hide(); } else { $('div.input').show(); } code_show = !code_show } $( document ).ready(code_toggle); </script> <form action="javascript:code_toggle()"><input type="submit" value="Toggle Code"></form>''')
#import tensorflow as tf
#print (tf.__version__)
import re
#from tensorflow.examples.tutorials.mnist import input_data
from matplotlib import offsetbox
import gc, argparse, sys, os, errno
%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set()
#sns.set_style('whitegrid')
import h5py
from PIL import Image
import os
from tqdm import tqdm_notebook as tqdm
import scipy
import sklearn
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.io import loadmat
import IPython.display as ipd
import IPython
import librosa.display
import librosa
def audio_stft(audio, n_fft=64):
    """Return the dB-scaled magnitude STFT of a 1-D waveform.

    BUG FIX: the original hard-coded ``n_fft=64`` inside the ``librosa.stft``
    call, silently ignoring the ``n_fft`` parameter. It is now passed through;
    the default of 64 keeps all existing call sites behaving identically.

    Parameters
    ----------
    audio : np.ndarray
        1-D audio samples (float).
    n_fft : int
        FFT window size; yields n_fft // 2 + 1 frequency bins.

    Returns
    -------
    np.ndarray
        dB-scaled magnitude spectrogram of shape (n_fft // 2 + 1, n_frames).
    """
    X = librosa.stft(audio, n_fft=n_fft)
    return librosa.amplitude_to_db(abs(X))
# Move up to the project root (the notebook lives one level below it).
cd ..
# Sanity-check that the reconstructed shtooka wav files are present on disk.
ls data/shtooka/wav4876/reconstruct_wave/
# --- Training split: load raw and DDSP-reconstructed audio, their saved
# spectrograms, and dB STFTs recomputed from the waveforms. ---
trainpath = 'data/ddsp/audio/recon/withz/train/'
testpath = 'data/ddsp/audio/recon/withz/test/'
trainfilenum = 160  # number of training files — TODO confirm against directory listing
testfilenum = 150   # number of test files — TODO confirm against directory listing
# 33 = 64 // 2 + 1 frequency bins of the n_fft=64 STFT produced by audio_stft;
# despite the name these are linear STFT bins, not mel bands.
nmels = 33
# STFT frames per clip — presumably 64000 samples / hop(16) + 1; verify if n_fft changes.
length = 4001
# Preallocate: 64000 samples = 4 s at sr=16000; spectrogram shape (334, 385)
# matches the .npy files written by the DDSP model — TODO confirm.
train_audio = np.ndarray([trainfilenum,64000])
train_audio_recon = np.ndarray([trainfilenum,64000])
train_spec = np.ndarray([trainfilenum,334, 385])
train_spec_recon = np.ndarray([trainfilenum,334, 385])
melspec_train = np.ndarray([trainfilenum,nmels, length])
melspec_train_recon = np.ndarray([trainfilenum,nmels, length])
for i in tqdm(range(trainfilenum)):
    # Note the asymmetric naming: 'audio_<i>.wav' vs 'recon_audio<i>.wav' (no underscore).
    train_audio[i] = librosa.core.load(trainpath+'audio_'+str(i)+'.wav',sr=16000)[0]
    train_audio_recon[i] = librosa.core.load(trainpath+'recon_audio'+str(i)+'.wav',sr=16000)[0]
    train_spec[i] = np.load(trainpath+'spec_'+str(i)+'.npy')
    train_spec_recon[i] = np.load(trainpath+'recon_spec_'+str(i)+'.npy')
    # dB-magnitude STFT recomputed directly from each waveform.
    melspec_train[i] = audio_stft(train_audio[i])
    melspec_train_recon[i] = audio_stft(train_audio_recon[i])
    #melspec_train_recon[i] = librosa.feature.melspectrogram(y=train_audio_recon[i], sr=16000, n_mels=nmels,fmax=8000)
# --- Test split: same loading procedure as the training split above. ---
test_audio = np.ndarray([testfilenum,64000])
test_audio_recon = np.ndarray([testfilenum,64000])
test_spec = np.ndarray([testfilenum,334, 385])
test_spec_recon = np.ndarray([testfilenum,334, 385])
melspec_test = np.ndarray([testfilenum,nmels, length])
melspec_test_recon = np.ndarray([testfilenum,nmels, length])
for i in tqdm(range(testfilenum)):
    # Asymmetric naming again: 'audio_<i>.wav' vs 'recon_audio<i>.wav'.
    test_audio[i] = librosa.core.load(testpath+'audio_'+str(i)+'.wav',sr=16000)[0]
    test_audio_recon[i] = librosa.core.load(testpath+'recon_audio'+str(i)+'.wav',sr=16000)[0]
    test_spec[i] = np.load(testpath+'spec_'+str(i)+'.npy')
    test_spec_recon[i] = np.load(testpath+'recon_spec_'+str(i)+'.npy')
    # dB-magnitude STFT recomputed directly from each waveform.
    melspec_test[i] = audio_stft(test_audio[i])
    melspec_test_recon[i] = audio_stft(test_audio_recon[i])
    #melspec_test[i] = librosa.feature.melspectrogram(y=test_audio[i], sr=16000, n_mels=nmels,fmax=8000)
    #melspec_test_recon[i] = librosa.feature.melspectrogram(y=test_audio_recon[i], sr=16000, n_mels=nmels,fmax=8000)
def MSE_pcc(A, B, ax=None):
    """Variance-normalised MSE and Pearson correlation between two arrays.

    Parameters
    ----------
    A, B : np.ndarray
        Arrays of identical shape (prediction and ground truth).
    ax : unused
        Kept only for interface compatibility with older callers.

    Returns
    -------
    (float, float)
        Mean squared error divided by B's variance, and the Pearson r of
        the flattened arrays.
    """
    diff = A - B
    nmse = (diff ** 2).mean() / B.var()
    r = pearsonr(A.ravel(), B.ravel())[0]
    return nmse, r
def analyze(predict, GT_STFT_test_spkr):
    """Score each prediction against its target and plot the distributions.

    Runs MSE_pcc on every (prediction, ground-truth) pair, then draws two
    histograms (MSE in blue, PCC in green) whose titles show mean(std).

    Returns
    -------
    (np.ndarray, np.ndarray)
        Per-sample MSE and PCC arrays, each of shape (N,).
    """
    n_samples = predict.shape[0]
    scores = [MSE_pcc(predict[k], GT_STFT_test_spkr[k])
              for k in tqdm(range(n_samples))]
    mse = np.array([s[0] for s in scores])
    pcc = np.array([s[1] for s in scores])
    fig, ax = plt.subplots(1, 2, figsize=(16, 4))
    ax[0].hist(mse, bins=25, color='b')
    ax[0].set_title('MSE: %g(%g)' % (np.round(mse.mean(), 3), np.round(mse.std(), 3)))
    ax[1].hist(pcc, bins=50, color='g')
    ax[1].set_title('PCC: %g(%g)' % (np.round(pcc.mean(), 3), np.round(pcc.std(), 3)))
    return mse, pcc
# Compare ground-truth vs reconstructed saved spectrograms on the training split.
GT_STFT_train_spkr, predict_train = train_spec,train_spec_recon
mse,pcc=analyze(predict_train,GT_STFT_train_spkr)
# Same comparison on the test split.
GT_STFT_test_spkr, predict = test_spec,test_spec_recon
mse,pcc=analyze(predict,GT_STFT_test_spkr)
#MSE_pcc(predict , GT_STFT_test_spkr)
# And on the dB STFTs recomputed from the audio.
# NOTE(review): this overwrites the mse/pcc results of the previous call.
mse,pcc=analyze(melspec_test,melspec_test_recon)
# Grid of test spectrograms: each panel stacks the transposed ground truth
# and the transposed prediction into a single image for visual comparison.
rownum = 2
columnnum = 8
n_show = rownum * columnnum
gt_panels = GT_STFT_test_spkr[:n_show]
pred_panels = predict[:n_show]
panels = [np.concatenate((g.T, p.T)) for g, p in zip(gt_panels, pred_panels)]
fig, ax = plt.subplots(rownum, columnnum, figsize=(columnnum * 2, rownum * 4))
for flat_idx, panel in enumerate(panels):
    ax[flat_idx // columnnum, flat_idx % columnnum].imshow(panel, cmap=cm.Blues)
plt.tight_layout()
# Paired grid for the test split: even rows show ground-truth STFTs,
# odd rows the corresponding predictions (transposed).
rownum = 2
columnnum = 8
fig, ax = plt.subplots(rownum * 2, columnnum, figsize=(columnnum * 2, rownum * 4))
for col in range(columnnum):
    for row in range(rownum):
        k = columnnum * row + col
        ax[2 * row, col].imshow(GT_STFT_test_spkr[k].T, cmap=cm.Blues)
        ax[2 * row + 1, col].imshow(predict[k].T, cmap=cm.Blues)
plt.tight_layout()
# Same paired layout for the test split, but rendered with librosa's
# specshow on the dB STFTs recomputed from the audio.
rownum = 2
columnnum = 8
fig, ax = plt.subplots(rownum * 2, columnnum, figsize=(columnnum * 2, rownum * 4))
for col in range(columnnum):
    for row in range(rownum):
        k = columnnum * row + col
        librosa.display.specshow(melspec_test[k], sr=16000, ax=ax[2 * row, col])
        librosa.display.specshow(melspec_test_recon[k], sr=16000, ax=ax[2 * row + 1, col])
plt.tight_layout()
# Paired grid for the training split: even rows show ground-truth STFTs,
# odd rows the corresponding predictions (transposed).
rownum = 2
columnnum = 8
fig, ax = plt.subplots(rownum * 2, columnnum, figsize=(columnnum * 2, rownum * 4))
for col in range(columnnum):
    for row in range(rownum):
        k = columnnum * row + col
        ax[2 * row, col].imshow(GT_STFT_train_spkr[k].T, cmap=cm.Blues)
        ax[2 * row + 1, col].imshow(predict_train[k].T, cmap=cm.Blues)
plt.tight_layout()
# specshow comparison on the training split: even rows ground truth,
# odd rows the prediction.
# BUG FIX: the second specshow originally re-plotted GT_STFT_train_spkr,
# so both rows of every pair showed the ground truth; it now plots
# predict_train, matching the imshow grid directly above.
rownum = 2
columnnum = 8
fig, ax = plt.subplots(rownum * 2, columnnum, figsize=(columnnum * 2, rownum * 4))
for i in range(columnnum):
    for j in range(rownum):
        librosa.display.specshow(GT_STFT_train_spkr[columnnum * j + i].T,
                                 sr=16000, ax=ax[j * 2, i])
        librosa.display.specshow(predict_train[columnnum * j + i].T,
                                 sr=16000, ax=ax[j * 2 + 1, i])
plt.tight_layout()
# Test clip 2: plot ground truth vs DDSP reconstruction on a shared
# y-range, then embed playable audio widgets for both.
ind = 2
fig, ax = plt.subplots(2, 1, figsize=(25, 18))
clips = (test_audio[ind], test_audio_recon[ind])
titles = ('Ground Truth Audio', 'DDSP Generated Audio')
for axis, clip, title in zip(ax, clips, titles):
    librosa.display.waveplot(clip, sr=16000, ax=axis)
    axis.set_title(title, fontsize=22)
    axis.set_ylim(-0.8, 0.8)
fig.tight_layout()
display(ipd.Audio(test_audio[ind], rate=16000))
display(ipd.Audio(test_audio_recon[ind], rate=16000))
# Test clip 10: same ground-truth vs reconstruction comparison as above.
ind = 10
fig, ax = plt.subplots(2, 1, figsize=(25, 18))
clips = (test_audio[ind], test_audio_recon[ind])
titles = ('Ground Truth Audio', 'DDSP Generated Audio')
for axis, clip, title in zip(ax, clips, titles):
    librosa.display.waveplot(clip, sr=16000, ax=axis)
    axis.set_title(title, fontsize=22)
    axis.set_ylim(-0.8, 0.8)
fig.tight_layout()
display(ipd.Audio(test_audio[ind], rate=16000))
display(ipd.Audio(test_audio_recon[ind], rate=16000))
# Test clip 21: same ground-truth vs reconstruction comparison as above.
ind = 21
fig, ax = plt.subplots(2, 1, figsize=(25, 18))
clips = (test_audio[ind], test_audio_recon[ind])
titles = ('Ground Truth Audio', 'DDSP Generated Audio')
for axis, clip, title in zip(ax, clips, titles):
    librosa.display.waveplot(clip, sr=16000, ax=axis)
    axis.set_title(title, fontsize=22)
    axis.set_ylim(-0.8, 0.8)
fig.tight_layout()
display(ipd.Audio(test_audio[ind], rate=16000))
display(ipd.Audio(test_audio_recon[ind], rate=16000))